require(readr)
## Loading required package: readr
require(dplyr)
## Loading required package: dplyr
## Warning: package 'dplyr' was built under R version 3.4.2
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
require(stringr)
## Loading required package: stringr
require(zipcode)
## Loading required package: zipcode
## Warning in library(package, lib.loc = lib.loc, character.only = TRUE,
## logical.return = TRUE, : there is no package called 'zipcode'
require(tidyverse)
## Loading required package: tidyverse
## Loading tidyverse: ggplot2
## Loading tidyverse: tibble
## Loading tidyverse: tidyr
## Loading tidyverse: purrr
## Conflicts with tidy packages ----------------------------------------------
## filter(): dplyr, stats
## lag():    dplyr, stats
require(ggmap)
## Loading required package: ggmap
require(lubridate)
## Loading required package: lubridate
## Warning: package 'lubridate' was built under R version 3.4.2
## 
## Attaching package: 'lubridate'
## The following object is masked from 'package:base':
## 
##     date
require(viridis)
## Loading required package: viridis
## Loading required package: viridisLite
require(ggthemes)
## Loading required package: ggthemes
require(plotly)
## Loading required package: plotly
## 
## Attaching package: 'plotly'
## The following object is masked from 'package:ggmap':
## 
##     wind
## The following object is masked from 'package:ggplot2':
## 
##     last_plot
## The following object is masked from 'package:stats':
## 
##     filter
## The following object is masked from 'package:graphics':
## 
##     layout
require(reshape2)
## Loading required package: reshape2
## 
## Attaching package: 'reshape2'
## The following object is masked from 'package:tidyr':
## 
##     smiths
nypd.vehicle <- read_csv("NYPD_Motor_Vehicle_Collisions.csv", col_names = T)
## Parsed with column specification:
## cols(
##   .default = col_character(),
##   TIME = col_time(format = ""),
##   `ZIP CODE` = col_integer(),
##   LATITUDE = col_double(),
##   LONGITUDE = col_double(),
##   `NUMBER OF PERSONS INJURED` = col_integer(),
##   `NUMBER OF PERSONS KILLED` = col_integer(),
##   `NUMBER OF PEDESTRIANS INJURED` = col_integer(),
##   `NUMBER OF PEDESTRIANS KILLED` = col_integer(),
##   `NUMBER OF CYCLIST INJURED` = col_integer(),
##   `NUMBER OF CYCLIST KILLED` = col_integer(),
##   `NUMBER OF MOTORIST INJURED` = col_integer(),
##   `NUMBER OF MOTORIST KILLED` = col_integer(),
##   `UNIQUE KEY` = col_integer()
## )
## See spec(...) for full column specifications.
summary(nypd.vehicle)
##      DATE               TIME            BOROUGH             ZIP CODE     
##  Length:1152443     Length:1152443    Length:1152443     Min.   :10000   
##  Class :character   Class1:hms        Class :character   1st Qu.:10128   
##  Mode  :character   Class2:difftime   Mode  :character   Median :11205   
##                     Mode  :numeric                       Mean   :10812   
##                                                          3rd Qu.:11236   
##                                                          Max.   :11697   
##                                                          NA's   :323606  
##     LATITUDE        LONGITUDE         LOCATION         ON STREET NAME    
##  Min.   : 0.00    Min.   :-201.36   Length:1152443     Length:1152443    
##  1st Qu.:40.67    1st Qu.: -73.98   Class :character   Class :character  
##  Median :40.72    Median : -73.93   Mode  :character   Mode  :character  
##  Mean   :40.72    Mean   : -73.92                                        
##  3rd Qu.:40.77    3rd Qu.: -73.87                                        
##  Max.   :41.13    Max.   :   0.00                                        
##  NA's   :211564   NA's   :211564                                         
##  CROSS STREET NAME  OFF STREET NAME    NUMBER OF PERSONS INJURED
##  Length:1152443     Length:1152443     Min.   : 0.0000          
##  Class :character   Class :character   1st Qu.: 0.0000          
##  Mode  :character   Mode  :character   Median : 0.0000          
##                                        Mean   : 0.2567          
##                                        3rd Qu.: 0.0000          
##                                        Max.   :43.0000          
##                                                                 
##  NUMBER OF PERSONS KILLED NUMBER OF PEDESTRIANS INJURED
##  Min.   :0.000000         Min.   : 0.00000             
##  1st Qu.:0.000000         1st Qu.: 0.00000             
##  Median :0.000000         Median : 0.00000             
##  Mean   :0.001201         Mean   : 0.05209             
##  3rd Qu.:0.000000         3rd Qu.: 0.00000             
##  Max.   :8.000000         Max.   :28.00000             
##                                                        
##  NUMBER OF PEDESTRIANS KILLED NUMBER OF CYCLIST INJURED
##  Min.   :0.000000             Min.   :0.00000          
##  1st Qu.:0.000000             1st Qu.:0.00000          
##  Median :0.000000             Median :0.00000          
##  Mean   :0.000664             Mean   :0.02078          
##  3rd Qu.:0.000000             3rd Qu.:0.00000          
##  Max.   :8.000000             Max.   :4.00000          
##                                                        
##  NUMBER OF CYCLIST KILLED NUMBER OF MOTORIST INJURED
##  Min.   :0.00e+00         Min.   : 0.0000           
##  1st Qu.:0.00e+00         1st Qu.: 0.0000           
##  Median :0.00e+00         Median : 0.0000           
##  Mean   :7.98e-05         Mean   : 0.1852           
##  3rd Qu.:0.00e+00         3rd Qu.: 0.0000           
##  Max.   :2.00e+00         Max.   :43.0000           
##                                                     
##  NUMBER OF MOTORIST KILLED CONTRIBUTING FACTOR VEHICLE 1
##  Min.   :0.000000          Length:1152443               
##  1st Qu.:0.000000          Class :character             
##  Median :0.000000          Mode  :character             
##  Mean   :0.000458                                       
##  3rd Qu.:0.000000                                       
##  Max.   :5.000000                                       
##                                                         
##  CONTRIBUTING FACTOR VEHICLE 2 CONTRIBUTING FACTOR VEHICLE 3
##  Length:1152443                Length:1152443               
##  Class :character              Class :character             
##  Mode  :character              Mode  :character             
##                                                             
##                                                             
##                                                             
##                                                             
##  CONTRIBUTING FACTOR VEHICLE 4 CONTRIBUTING FACTOR VEHICLE 5
##  Length:1152443                Length:1152443               
##  Class :character              Class :character             
##  Mode  :character              Mode  :character             
##                                                             
##                                                             
##                                                             
##                                                             
##    UNIQUE KEY      VEHICLE TYPE CODE 1 VEHICLE TYPE CODE 2
##  Min.   :     22   Length:1152443      Length:1152443     
##  1st Qu.: 290312   Class :character    Class :character   
##  Median :3212342   Mode  :character    Mode  :character   
##  Mean   :2286022                                          
##  3rd Qu.:3500480                                          
##  Max.   :3789579                                          
##                                                           
##  VEHICLE TYPE CODE 3 VEHICLE TYPE CODE 4 VEHICLE TYPE CODE 5
##  Length:1152443      Length:1152443      Length:1152443     
##  Class :character    Class :character    Class :character   
##  Mode  :character    Mode  :character    Mode  :character   
##                                                             
##                                                             
##                                                             
## 

1) The most dangerous/safest intersections in NYC

First, we mutate two new variables Num_Injured and Num_Killed which are the sum of all injured or killed people (people in the vehicles, pedestrians, cyclists, motorists) and then group them by cross street. We then summarise by number of accidents, number of injuries and number of deaths for each of the cross streets.

colnames(nypd.vehicle) <- str_replace_all(colnames(nypd.vehicle), " ", "_")
q1 <- nypd.vehicle %>% 
  mutate(num = 1,
         Num_Injured = NUMBER_OF_PERSONS_INJURED + NUMBER_OF_PEDESTRIANS_INJURED + NUMBER_OF_CYCLIST_INJURED + NUMBER_OF_MOTORIST_INJURED,
         Num_Killed = NUMBER_OF_PERSONS_KILLED + NUMBER_OF_PEDESTRIANS_KILLED + NUMBER_OF_CYCLIST_KILLED + NUMBER_OF_MOTORIST_KILLED
         ) %>%
  filter(!is.na(CROSS_STREET_NAME)) %>%
  group_by(CROSS_STREET_NAME) %>%
  summarise(Number_of_Accidents = n(), 
            Num_Injured = sum(Num_Injured),
            Num_Killed = sum(Num_Killed))

We use three different criteria to determine the most dangerous/safest intersections based on Number_of_Accidents, Num_Injured, and Num_Killed. 3rd ave was the most dangerous in terms of accidents and Broadway in terms of number injured and killed. We created tables for the safest and most dangerous intersections, using max of accidents and min of accidents, respectively. Non-busy places such as precincts have the lowest number of non-zero accidents.

knitr::kable(q1[q1$Number_of_Accidents == max(q1$Number_of_Accidents), ], 
             caption = "Most dangerous NYC intersections")
Most dangerous NYC intersections
CROSS_STREET_NAME Number_of_Accidents Num_Injured Num_Killed
3 AVENUE 12116 4547 10
knitr::kable(q1[q1$Num_Injured == max(q1$Num_Injured), ], 
             caption = "Most dangerous NYC intersections")
Most dangerous NYC intersections
CROSS_STREET_NAME Number_of_Accidents Num_Injured Num_Killed
BROADWAY 11871 4714 28
knitr::kable(q1[q1$Num_Killed == max(q1$Num_Killed), ], 
             caption = "Most dangerous NYC intersections")
Most dangerous NYC intersections
CROSS_STREET_NAME Number_of_Accidents Num_Injured Num_Killed
BROADWAY 11871 4714 28
safe <- q1[q1$Number_of_Accidents == min(q1$Number_of_Accidents) & q1$Num_Killed == 0 &
              q1$Num_Injured == 0,]
knitr::kable(head(safe), caption = "Safest NYC intersections")
Safest NYC intersections
CROSS_STREET_NAME Number_of_Accidents Num_Injured Num_Killed
0 1 0 0
043 PCT 1 0 0
1 ave 1 0 0
1 Avenue 1 0 0
1 AVENUE + 2 AVENUE 1 0 0
1 AVENUE TO PLEASANT AVENUE 1 0 0

We then made 2 bar graphs out of this to plot the top 5 most dangerous and safest NYC cross streets, each broken down into 3 bars by Number_of_Accidents, Num_Injured, and Num_Killed. We did this through the use of gather and dodge positioning.

ggplot(data = q1 %>% arrange(-Number_of_Accidents) %>% top_n(5,Number_of_Accidents) %>% 
         gather("Number_of_Accidents", "Num_Injured", "Num_Killed",key = Type, value = Amount),
       aes(x = CROSS_STREET_NAME, y =  Amount, fill = Type)) +
   geom_col(position = "dodge") +
  ggtitle("Most dangerous NYC intersections") + xlab("Cross street") + ylab("number") +
  theme(axis.text.x  = element_text(angle = 0),
        text = element_text(size = 10, hjust=0.5 , vjust = 1, face = "bold"))

From the plot, we can direcly find that the intersection which has the greatest number of injured people is Broadway and the one which has the greatest number of accidents is 3 Avenue, but the number of killed people is hard to see because of its scale relative to the accident and injured quantity. If you look closely though, Broadway definitely has the thickest and most visible green mark for number killed.
ggplot(data = q1 %>% arrange(Number_of_Accidents)  %>% head(5) %>% 
         gather("Number_of_Accidents", "Num_Injured", "Num_Killed",key = Type, value = Amount),
       aes(x = CROSS_STREET_NAME, y =  Amount, fill = Type)) +
   geom_col(position = "dodge") +
  ggtitle("Safest NYC intersections") + xlab("Cross street") + ylab("number") +
  theme(axis.text.x  = element_text(angle = 0),
        text = element_text(size = 10, hjust=0.5, vjust = 1, face = "bold"))

From the plot, we can direcly find that the intersections which have the least number of injured people is 0, 043 PCT and 1 Avenue, and all of the top 5 safest intersections have "1" for number of accidents and "0" for the number of killed people.

2) Distribution of killed people based on the map of NYC

From the plot of the most dangerous intersections of NYC, we noticed that the number of killed people is to small to see from the plot. We use ggmap to get the google map of NYC and figure out the distribution of people killed based on the map of NYC and utilizing the latitude and longitude coordinates. We created a data frame that gave the lat. and long. for each accident in which someone was killed, by omitting ones in which no one was killed with filter.

colnames(nypd.vehicle) <- str_replace_all(colnames(nypd.vehicle), " ", "_")
nypd <- nypd.vehicle %>%
  select(LATITUDE,LONGITUDE,NUMBER_OF_PERSONS_KILLED) %>% 
  gather(type,value,3:3) %>%
  na.omit() %>% 
  group_by(LATITUDE,LONGITUDE,type) %>% 
  summarise(total=sum(value,na.rm=T)) %>% 
  filter(total!=0)
nypd
## # A tibble: 996 x 4
## # Groups:   LATITUDE, LONGITUDE [996]
##    LATITUDE LONGITUDE                     type total
##       <dbl>     <dbl>                    <chr> <int>
##  1 40.50619 -74.23489 NUMBER_OF_PERSONS_KILLED     1
##  2 40.51387 -74.23668 NUMBER_OF_PERSONS_KILLED     1
##  3 40.51662 -74.20279 NUMBER_OF_PERSONS_KILLED     1
##  4 40.52400 -74.18610 NUMBER_OF_PERSONS_KILLED     1
##  5 40.52664 -74.16813 NUMBER_OF_PERSONS_KILLED     1
##  6 40.52937 -74.16135 NUMBER_OF_PERSONS_KILLED     2
##  7 40.53078 -74.22443 NUMBER_OF_PERSONS_KILLED     2
##  8 40.53427 -74.15397 NUMBER_OF_PERSONS_KILLED     2
##  9 40.53866 -74.18800 NUMBER_OF_PERSONS_KILLED     1
## 10 40.54101 -74.14710 NUMBER_OF_PERSONS_KILLED     1
## # ... with 986 more rows
nyc <- get_map("new york",zoom=11)
## Map from URL : http://maps.googleapis.com/maps/api/staticmap?center=new+york&zoom=11&size=640x640&scale=2&maptype=terrain&language=en-EN&sensor=false
## Information from URL : http://maps.googleapis.com/maps/api/geocode/json?address=new%20york&sensor=false
map_killed <- ggmap(nyc)+geom_point(data=subset(nypd,type=="NUMBER_OF_PERSONS_KILLED"), 
             aes(x=LONGITUDE, y=LATITUDE, colour=total),size= 2,alpha=0.2) +
  ggtitle("Distribution of deaths")+xlab("Longitude")+ylab("Latitude") + 
  scale_color_gradient(low = "red",  high = "black")
map_killed
## Warning: Removed 125 rows containing missing values (geom_point).

From the map, persons who got killed in the collisions are relatively few compared to persons who got injured. Most of the collisions involving death of people took place in Manhattan, Brooklyn and Queens and then Bronx, while Staten island has the least number of death of people.

3) The probability that people were injured or killed from the collision

We combined the number of people injured and the number of people killed to figure out the proportion of crashes that resulted in an injury or death. We got the proportion by taking the mean of the conditional statement of whether someone got injured or killed.

q4 <- nypd.vehicle %>%
  mutate(Injured_or_Killed = NUMBER_OF_PERSONS_INJURED + NUMBER_OF_PEDESTRIANS_INJURED + NUMBER_OF_CYCLIST_INJURED + NUMBER_OF_MOTORIST_INJURED +NUMBER_OF_PERSONS_KILLED + NUMBER_OF_PEDESTRIANS_KILLED + NUMBER_OF_CYCLIST_KILLED + NUMBER_OF_MOTORIST_KILLED
  ) %>%
  mutate(Injured_or_Killed_Yes = ifelse(Injured_or_Killed > 0, 1, 0))


prop <- mean(q4$Injured_or_Killed_Yes)
prop
## [1] 0.1886957
It was around 19% of people that get injured or killed per crash.

4) Distribution of collisions by boroughs

In order to look at how the average number of accidents varied over time within each of the 5 boroughs throughout the years of the dataset, we filtered the data by the borough variable and then used group_by to organize by the date of the year and borough and summarised an average accidents variable that was the mean of the number of accidents per month in that borough at each of those moments in time. We did this with a geom_line to track the trends and ebbs and flows for each and grouped by the date it occured and the borough it occurred in and summarise the average collisions per month for those groups.

nypd.vehicle$DATE <- mdy(nypd.vehicle$DATE)
borough.accident <- nypd.vehicle %>% filter(BOROUGH!="") %>%  group_by(DATE,BOROUGH) %>% summarise(Average_CollisionsPerMonth = mean(n())) %>% na.omit()

ggplot(data=borough.accident,aes(x=DATE, y=Average_CollisionsPerMonth, colour=BOROUGH, group=BOROUGH)) + geom_line()+geom_point(size=0.5,shape=1) + ggtitle("Average Accidents per month by Borough")+geom_text(aes(label=ifelse(Average_CollisionsPerMonth>180,Average_CollisionsPerMonth,"")), size=2,hjust=1.5)

The result seems quite obvious.
The overall ranking of the boroughs from the most accidents happened to the least is:
1. Brooklyn
2. Queens
3. Manhattan
4. Bronx
5. Staten Island

5) Type of vehicles

In order to find the type of vehicles (pickup truck, passenger, taxi, etc.) most frequently caused a collision, we add the two vehicle types together and sort the 10 most common types of vehicles based on the number of collisions. We did this by creating a union of vehicle_1 and vehicle_2 from q5.1 and q5.2, as there are usually 2 vehicles involved in a collision.

q5.1 <- nypd.vehicle %>%
  rename(VEHICLE_TYPE = VEHICLE_TYPE_CODE_1) %>%
  group_by(VEHICLE_TYPE) %>%
  summarise(Number_of_Accidents = n()) %>%
  mutate(Type = "VEHICLE_TYPE_CODE_1")
  
q5.2 <- nypd.vehicle %>%
  rename(VEHICLE_TYPE = VEHICLE_TYPE_CODE_2) %>%
  group_by(VEHICLE_TYPE) %>%
  summarise(Number_of_Accidents = n()) %>%
  mutate(Type = "VEHICLE_TYPE_CODE_2")

q2 <- union_all(q5.1, q5.2) %>%
  filter(!VEHICLE_TYPE %in% c("UNKNOWN", "OTHER", NA)) %>%
  group_by(Type) %>%
  arrange(-Number_of_Accidents) %>%
  top_n(10,Number_of_Accidents)
knitr::kable(q2, caption = "Number of Collisions by Vehicle Type")
Number of Collisions by Vehicle Type
VEHICLE_TYPE Number_of_Accidents Type
PASSENGER VEHICLE 662184 VEHICLE_TYPE_CODE_1
PASSENGER VEHICLE 496070 VEHICLE_TYPE_CODE_2
SPORT UTILITY / STATION WAGON 274688 VEHICLE_TYPE_CODE_1
SPORT UTILITY / STATION WAGON 205959 VEHICLE_TYPE_CODE_2
TAXI 44544 VEHICLE_TYPE_CODE_1
TAXI 36640 VEHICLE_TYPE_CODE_2
VAN 26846 VEHICLE_TYPE_CODE_1
BICYCLE 25180 VEHICLE_TYPE_CODE_2
VAN 23905 VEHICLE_TYPE_CODE_2
PICK-UP TRUCK 20446 VEHICLE_TYPE_CODE_1
PICK-UP TRUCK 17816 VEHICLE_TYPE_CODE_2
SMALL COM VEH(4 TIRES) 14874 VEHICLE_TYPE_CODE_1
SMALL COM VEH(4 TIRES) 14627 VEHICLE_TYPE_CODE_2
LARGE COM VEH(6 OR MORE TIRES) 14117 VEHICLE_TYPE_CODE_1
BUS 14044 VEHICLE_TYPE_CODE_1
LARGE COM VEH(6 OR MORE TIRES) 13367 VEHICLE_TYPE_CODE_2
BUS 11460 VEHICLE_TYPE_CODE_2
LIVERY VEHICLE 9615 VEHICLE_TYPE_CODE_1
LIVERY VEHICLE 7704 VEHICLE_TYPE_CODE_2
MOTORCYCLE 6607 VEHICLE_TYPE_CODE_1

We then created a bar graph that illustrated the 10 most frequent car types that got into a collision, broken down by the 2 vehicle codes and how many collisions there were involving those vehicle types.

ggplot(data = q2,
       aes(x = VEHICLE_TYPE, y =  Number_of_Accidents, fill = Type)) +
   geom_col(position = "stack") +
  ggtitle("Number of Collisions by Vehicle Type") + ylab("Number of collisions") + xlab("Vehicle_Type") +
  theme(axis.text.x  = element_text(angle = 0),
        text = element_text(size = 10, hjust=0.5, vjust = 1, face = "bold")) + coord_flip()

From the plot, passanger vehicles are involved in much more accidents than any other vehicles. Sport utility/station wagon come in second, involved in less accidents than the passanger vehicles but still a lot relative to even the third place vehicle. The rest seems similar after taxis.

7) Time-interval

In order to figure out certain time periods that the collision occured more frequently, we split one day into 24 hours and count the number of collisions happened in each time interval.

# 60 minute
q5.1 <- nypd.vehicle %>%
  group_by(TIME) %>%
  summarise(n = n()) %>%
  mutate(time = hms(TIME),
         time = hour(time)*3600 + minute(time)*60 + second(time),
         times = ceiling(time/3600),
         time.end = time/3600
         )

q5.interval <- q5.1 %>%
  filter(times == time.end) %>%
  union_all(q5.1 %>% top_n(1)) %>%
  select(times, TIME)
## Selecting by time.end
q5.interval <- q5.interval %>%
  rename(start_time = TIME) %>%
  inner_join(
    q5.interval %>%
      rename(end_time = TIME) %>%
  mutate(times = times - 1) 
  )
## Joining, by = "times"
q5.2 <- q5.1 %>%
  group_by(times) %>%
  summarise(n = sum(n)) %>%
  inner_join(q5.interval) %>%
  arrange(-n) %>% 
  mutate(Collisions = n) %>%
  select(start_time, end_time, Collisions) %>%
  mutate(time_interval = paste0(start_time, "->", end_time)) %>%
  arrange(start_time)
## Joining, by = "times"
knitr::kable(q5.2, caption = "Most frequent 1-hour intervals for collisions to occur")
Most frequent 1-hour intervals for collisions to occur
start_time end_time Collisions time_interval
00:00:00 01:00:00 4683 00:00:00->01:00:00
01:00:00 02:00:00 28900 01:00:00->02:00:00
02:00:00 03:00:00 17467 02:00:00->03:00:00
03:00:00 04:00:00 13518 03:00:00->04:00:00
04:00:00 05:00:00 11826 04:00:00->05:00:00
05:00:00 06:00:00 14170 05:00:00->06:00:00
06:00:00 07:00:00 15668 06:00:00->07:00:00
07:00:00 08:00:00 25501 07:00:00->08:00:00
08:00:00 09:00:00 37082 08:00:00->09:00:00
09:00:00 10:00:00 66095 09:00:00->10:00:00
10:00:00 11:00:00 63088 10:00:00->11:00:00
11:00:00 12:00:00 59013 11:00:00->12:00:00
12:00:00 13:00:00 62297 12:00:00->13:00:00
13:00:00 14:00:00 66217 13:00:00->14:00:00
14:00:00 15:00:00 69425 14:00:00->15:00:00
15:00:00 16:00:00 80642 15:00:00->16:00:00
16:00:00 17:00:00 71858 16:00:00->17:00:00
17:00:00 18:00:00 86260 17:00:00->18:00:00
18:00:00 19:00:00 82322 18:00:00->19:00:00
19:00:00 20:00:00 70778 19:00:00->20:00:00
20:00:00 21:00:00 57371 20:00:00->21:00:00
21:00:00 22:00:00 47917 21:00:00->22:00:00
22:00:00 23:00:00 40058 22:00:00->23:00:00
23:00:00 23:59:00 36018 23:00:00->23:59:00
ggplot(data = q5.2,
       aes(x = seq(0,23,1), y =  Collisions, fill = time_interval)) +
   geom_col(position = "stack") +
  ggtitle("Collisions by Hour") + xlab("Time(hrs)") + ylab("Number of collisions")+
    theme(axis.text.x  = element_text(angle = 0),
        text = element_text(hjust=0.5, vjust = 1, size = 10, face = "bold"),
        legend.position="none")

From the plot, we found that the number of accidents rises suddenly between 8am and 9am and continues to rise through 3 pm. The maximum hour in terms of accidents was at 5pm.